library(AppliedPredictiveModeling) # Data sets
library(tidyverse) # Oppan tidy style
library(caret) # Modeling
library(e1071) # skewness
data(segmentationOriginal)
segmentationOriginal <- as_tibble(segmentationOriginal)
segmentationOriginal
seg_data <- subset(segmentationOriginal, Case == "Train")
seg_data
cell_id <- seg_data$Case
class <- seg_data$Class
case <- seg_data$Case
seg_data <- seg_data[, -(1:3)]
seg_data %>% select(-contains("Status")) -> seg_data
seg_data
Skewness
skewness(seg_data$AngleCh1)
[1] -0.02426252
#seg_data %>% map_dfr(skewness)
summarize_all(seg_data, skewness)
Box-Cox transform
Ch1AreaTrans <- BoxCoxTrans(seg_data$AreaCh1)
Ch1AreaTrans
Box-Cox Transformation
1009 data points used to estimate Lambda
Input data summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
150.0 194.0 256.0 325.1 376.0 2186.0
Largest/Smallest: 14.6
Sample Skewness: 3.53
Estimated Lambda: -0.9
Apply the transform with the predict function
predict(Ch1AreaTrans, head(seg_data$AreaCh1)) -> dat
dat
[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523
Or perform it all at once via caret::preProcess
pca_object <- prcomp(seg_data, center = TRUE, scale = TRUE)
percent_variance <- pca_object$sdev^2/sum(pca_object$sdev^2)*100
percent_variance[1:3]
[1] 20.91236 17.01330 11.88689
Near zero variance
nearZeroVar(seg_data)
integer(0)
Correlations
correlations <- cor(seg_data)
dim(correlations)
[1] 58 58
correlations[1:4, 1:4]
AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1 1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1 -0.002627172 1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394 1.00000000 0.52521711
AvgIntenCh2 -0.019446810 -0.153303007 0.52521711 1.00000000
corrplot::corrplot(correlations, order = "hclust")
skimr::skim(Glass)
Skim summary statistics
n obs: 214
n variables: 10
Variable type: factor
variable missing complete n n_unique top_counts ordered
Type 0 214 214 6 2: 76, 1: 70, 7: 29, 3: 17 FALSE
Variable type: numeric
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
Al 0 214 214 1.44 0.5 0.29 1.19 1.36 1.63 3.5 ▁▂▇▅▂▁▁▁
Ba 0 214 214 0.18 0.5 0 0 0 0 3.15 ▇▁▁▁▁▁▁▁
Ca 0 214 214 8.96 1.42 5.43 8.24 8.6 9.17 16.19 ▁▂▇▂▁▁▁▁
Fe 0 214 214 0.057 0.097 0 0 0 0.1 0.51 ▇▁▁▁▁▁▁▁
K 0 214 214 0.5 0.65 0 0.12 0.56 0.61 6.21 ▇▁▁▁▁▁▁▁
Mg 0 214 214 2.68 1.44 0 2.11 3.48 3.6 4.49 ▃▁▁▁▁▁▇▁
Na 0 214 214 13.41 0.82 10.73 12.91 13.3 13.83 17.38 ▁▁▇▇▃▁▁▁
RI 0 214 214 1.52 0.003 1.51 1.52 1.52 1.52 1.53 ▁▅▇▂▁▁▁▁
Si 0 214 214 72.65 0.77 69.81 72.28 72.79 73.09 75.41 ▁▁▂▃▇▁▁▁
Predictor variables
select(Glass, -Type) %>% cor() %>% corrplot::corrplot()
caret::nearZeroVar(Glass)
integer(0)
summarize_if(Glass, .p = is.numeric, .f = e1071::skewness)
map_if(Glass, .p = is.numeric, .f = BoxCoxTrans) %>% map_dfr("lambda", .null=NA)